Monitoring data (BEMS & DEMS):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from datetime import datetime
from holidays_es import Province
from heatmap import corrplot
# Load the three raw BEMS/DEMS export files (semicolon-separated, no header row).
raw_paths = [
    '../VEOLIA/Data/UC4-2020-11to12.csv',
    '../VEOLIA/Data/UC4-2021-01to03.csv',
    '../VEOLIA/Data/UC4-2021-04to06.csv',
]
column_names = ['location', 'meter_id', 'meter_name', 'datetime', 'value']
df1, df2, df3 = (pd.read_csv(path, delimiter=';', header=None, index_col=None)
                 for path in raw_paths)
# One concatenated frame plus the three per-period frames, all sharing column names.
df = pd.concat([df1, df2, df3])
for frame in (df, df1, df2, df3):
    frame.columns = column_names
# Metadata table describing each meter (variable) and its unit.
variables = pd.read_csv('../VEOLIA/Data/Pilot2_Variables_UC4.csv', delimiter=';')
# useful dicts
id_to_name = dict(zip(variables['ID'], variables['Variable']))
name_to_id = dict(zip(variables['Variable'], variables['ID']))
variable_to_unit = dict(zip(variables['Variable'], variables['Unit']))
print('Dataset structure')
df.head()
Dataset structure
| location | meter_id | meter_name | datetime | value | |
|---|---|---|---|---|---|
| 0 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:00:00.000 | 1623,90 |
| 1 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:15:00.000 | 1623,90 |
| 2 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:30:00.000 | 1637,00 |
| 3 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:45:00.000 | 1649,40 |
| 4 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 01:00:00.000 | 1661,30 |
The datetime column is converted to the datetime.datetime type.
def to_datetime(x):
    """Parse a 'YYYY-MM-DD HH:MM:SS.fff' timestamp string into a datetime.

    Float inputs (pandas NaN placeholders for missing timestamps) are mapped
    to np.nan instead of being parsed.  The original wrapped str(x) in a
    pointless "".join(...) — joining a single string returns it unchanged.
    """
    if isinstance(x, float):
        return np.nan
    return datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S.%f')
# Parse every timestamp string into a datetime object (NaN rows stay NaN);
# the function can be handed to .apply directly, no lambda wrapper needed.
df['datetime'] = df['datetime'].apply(to_datetime)
df.head()
| location | meter_id | meter_name | datetime | value | |
|---|---|---|---|---|---|
| 0 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:00:00 | 1623,90 |
| 1 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:15:00 | 1623,90 |
| 2 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:30:00 | 1637,00 |
| 3 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 00:45:00 | 1649,40 |
| 4 | EDEN1C - M203299 - CORTES DE FUENSALDAÑA | 78562 | CONTADOR GAS CALDERA 1 (15 minuto) | 2020-11-01 01:00:00 | 1661,30 |
Unfortunately we can see that not all dataframes contain all variables:
# Variables present in the first and last export; the later file carries many
# more meters than the first two.
initial_variables = df1['meter_name'].unique()
all_variables = df3['meter_name'].unique()
print('7 Variables for 11-12/2020:\n', initial_variables)
print('7 Variables for 01-03/2021\n', df2['meter_name'].unique())
# BUG FIX: the period label said 04-06/2020, but this file covers April-June 2021.
print('Variables for 04-06/2021\n', all_variables)
7 Variables for 11-12/2020: ['CONTADOR GAS CALDERA 1 (15 minuto)' 'CONTADOR GAS CALDERA 2 (15 minuto)' 'Energía Caldera 1 (15 minuto)' 'Energía Caldera 2 (15 minuto)' 'Energía Primario ACS (15 minuto)' 'Energía Consumo ACS (15 minuto)' 'Energia Activa Total (15 minuto)'] 7 Variables for 01-03/2021 ['CONTADOR GAS CALDERA 1 (15 minuto)' 'CONTADOR GAS CALDERA 2 (15 minuto)' 'Energía Caldera 1 (15 minuto)' 'Energía Caldera 2 (15 minuto)' 'Energía Primario ACS (15 minuto)' 'Energía Consumo ACS (15 minuto)' 'Energia Activa Total (15 minuto)'] Variables for 04-06/2020 ['TEMPERATURA EXTERIOR (15 minuto)' 'CONTADOR GAS CALDERA 1 (15 minuto)' 'CONTADOR GAS CALDERA 2 (15 minuto)' 'Energía Caldera 1 (15 minuto)' 'Caudal Caldera 1 (15 minuto)' 'TEMPERATURA COLECTOR IMPULSION (15 minuto)' 'Volumen Caldera 1 (15 minuto)' 'Potencia Caldera 1 (15 minuto)' 'Temper Impulsion Caldera 1 (15 minuto)' 'Temper Retorno Caldera 1 (15 minuto)' 'Energía Caldera 2 (15 minuto)' 'Caudal Caldera 2 (15 minuto)' 'Volumen Caldera 2 (15 minuto)' 'Potencia Caldera 2 (15 minuto)' 'Temper Impulsion Caldera 2 (15 minuto)' 'Temper Retorno Caldera 2 (15 minuto)' 'TEMPERATUR COLECTOR RETORNO (15 minuto)' 'Energía Primario ACS (15 minuto)' 'Caudal Primario ACS (15 minuto)' 'Volumen Primario ACS (15 minuto)' 'Potencia Primario ACS (15 minuto)' 'Temper Impulsion Primario ACS (15 minuto)' 'Temper Retorno Primario ACS (15 minuto)' 'Energía Consumo ACS (15 minuto)' 'Caudal Consumo ACS (15 minuto)' 'Volumen Consumo ACS (15 minuto)' 'Potencia Consumo ACS (15 minuto)' 'TEMPERATURA IMPULSION CALEFACC (15 minuto)' 'Temper Impulsion Consumo ACS (15 minuto)' 'Temper Retorno Consumo ACS (15 minuto)' 'Energia Activa Total (15 minuto)' 'TEMPERATURA DEPOSITO 1 ACS (15 minuto)' 'TEMPERATURA DEPOSITO 2 ACS (15 minuto)' 'DEMANDA CALEFACCIÓN (15 minuto)' 'DEMANDA ACS (15 minuto)' 'DEMANDA QUEMADORES (15 minuto)']
This is the crucial date when the extra variables are added to the dataset: the new variables first appear on the 14th of May 2021 at 09:15:00 am. From then on the dataset is complete.
print("Number of variables before 14/5/2021 09:15:00:", df[df.datetime < datetime(2021, 5, 14, 9, 15, 0)]['meter_id'].unique().shape[0])
print("Number of variables on 14/5/2021 after 09:15:00:", df[(df.datetime >= datetime(2021, 5, 14, 9, 15, 0)) & (df.datetime < datetime(2021, 5, 14, 9, 30, 0))]['meter_id'].unique().shape[0])
print("Number of variables on 15/5/2021:", df[df.datetime == datetime(2021, 5, 15)]['meter_id'].unique().shape[0])
print("Number of variables after 15/5/2021:", df[df.datetime > datetime(2021, 5, 15)]['meter_id'].unique().shape[0])
Number of variables before 14/5/2021 09:15:00: 7 Number of variables on 14/5/2021 after 09:15:00: 36 Number of variables on 15/5/2021: 36 Number of variables after 15/5/2021: 36
The new variables added are the following 29:
# The 29 meters that exist only in the later export (set difference of names).
later_names = set(df3['meter_name'].unique())
early_names = set(df1['meter_name'].unique())
new_variables = list(later_names - early_names)
new_variables
['Caudal Primario ACS (15 minuto)', 'Temper Impulsion Primario ACS (15 minuto)', 'DEMANDA CALEFACCIÓN (15 minuto)', 'TEMPERATURA EXTERIOR (15 minuto)', 'Temper Retorno Caldera 2 (15 minuto)', 'DEMANDA QUEMADORES (15 minuto)', 'Temper Retorno Consumo ACS (15 minuto)', 'Volumen Caldera 2 (15 minuto)', 'TEMPERATUR COLECTOR RETORNO (15 minuto)', 'Caudal Consumo ACS (15 minuto)', 'Temper Impulsion Caldera 1 (15 minuto)', 'Volumen Primario ACS (15 minuto)', 'Temper Retorno Primario ACS (15 minuto)', 'DEMANDA ACS (15 minuto)', 'Potencia Caldera 1 (15 minuto)', 'Potencia Consumo ACS (15 minuto)', 'Volumen Consumo ACS (15 minuto)', 'Caudal Caldera 1 (15 minuto)', 'TEMPERATURA DEPOSITO 2 ACS (15 minuto)', 'Volumen Caldera 1 (15 minuto)', 'TEMPERATURA COLECTOR IMPULSION (15 minuto)', 'Temper Impulsion Caldera 2 (15 minuto)', 'Potencia Primario ACS (15 minuto)', 'Potencia Caldera 2 (15 minuto)', 'Temper Retorno Caldera 1 (15 minuto)', 'Temper Impulsion Consumo ACS (15 minuto)', 'TEMPERATURA DEPOSITO 1 ACS (15 minuto)', 'TEMPERATURA IMPULSION CALEFACC (15 minuto)', 'Caudal Caldera 2 (15 minuto)']
There is no missing data initially, but missing values will be introduced soon (by the outer join of timeseries with different time ranges).
print("Total number of rows containing missing value:")
print(df.isna().sum().sum())
Total number of rows containing missing value: 0
First of all a dictionary of dataframes is created. Each dataframe refers to one meter / variable and contains the columns: datetime & value. Thus we now have as many timeseries datasets as the number of meters which need to be concatenated to form a unified timeseries dataset of all variables (meters). Outer join is performed so that variables that do not take values at all timesteps are retained and filled with a NaN value.
df['meter_name'].unique()
# Build one (datetime, value) frame per meter, renaming 'value' to the meter
# name so the later merge produces one column per meter.
ts_dict = {}
for meter_name in df['meter_name'].unique():
    ts_dict[meter_name] = df[df['meter_id'] == name_to_id[meter_name]][['datetime', 'value']]
    ts_dict[meter_name].columns = ['datetime', meter_name]
ts_dict
# Outer-merge all per-meter frames on datetime so meters with shorter
# histories are padded with NaN.  (Replaces the original dummy-'nothing'-column
# seed frame, which existed only to have something to merge into.)
ts = None
for meter_frame in ts_dict.values():
    ts = meter_frame if ts is None else pd.merge(ts, meter_frame, on='datetime', how='outer')
# datetime as index
ts.index = ts['datetime']
unique_dates = ts['datetime']
ts.drop(columns=['datetime'], inplace=True)
# Values arrive as strings with comma decimal separators (e.g. '1623,90');
# normalise the separator and convert to float in one pass.
ts = ts.apply(lambda col: pd.to_numeric(col.str.replace(',', '.')))
ts.head()
| CONTADOR GAS CALDERA 1 (15 minuto) | CONTADOR GAS CALDERA 2 (15 minuto) | Energía Caldera 1 (15 minuto) | Energía Caldera 2 (15 minuto) | Energía Primario ACS (15 minuto) | Energía Consumo ACS (15 minuto) | Energia Activa Total (15 minuto) | TEMPERATURA EXTERIOR (15 minuto) | Caudal Caldera 1 (15 minuto) | TEMPERATURA COLECTOR IMPULSION (15 minuto) | ... | Volumen Consumo ACS (15 minuto) | Potencia Consumo ACS (15 minuto) | TEMPERATURA IMPULSION CALEFACC (15 minuto) | Temper Impulsion Consumo ACS (15 minuto) | Temper Retorno Consumo ACS (15 minuto) | TEMPERATURA DEPOSITO 1 ACS (15 minuto) | TEMPERATURA DEPOSITO 2 ACS (15 minuto) | DEMANDA CALEFACCIÓN (15 minuto) | DEMANDA ACS (15 minuto) | DEMANDA QUEMADORES (15 minuto) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||||||||
| 2020-11-01 00:00:00 | 1623.9 | 3290.1 | 33.91 | 43.26 | 1.54 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-11-01 00:15:00 | 1623.9 | 3290.1 | 33.91 | 43.26 | 1.54 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-11-01 00:30:00 | 1637.0 | 3296.1 | 33.92 | 43.27 | 1.55 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-11-01 00:45:00 | 1649.4 | 3296.1 | 33.93 | 43.27 | 1.56 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-11-01 01:00:00 | 1661.3 | 3296.1 | 33.94 | 43.27 | 1.57 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 36 columns
As timeseries do not share all the time range from 11/20 to 06/21 NaNs are used to fill the gaps. Something good to observe is that datetime column consists of 23232 datetimes which is exactly what we should observe for 4 quarters x 24 hours x n_days where n_days = 30 (Nov) + 31 (Dec) + 31 (Jan) + 28 + 31 + 30 + 31 + 30 (June) = 23232 quarterly timesteps.
It is observed that on 2021-03-19 10:15:00 all energy meters have some erroneous measurements as they stop the ascending pattern and repeat the last 3 values.
# Zoom into the window around the glitch; regex 'Energ' matches all energy meters.
ts[datetime(2021, 3, 19, 9, 30, 0):datetime(2021, 3, 19, 11, 30, 0)].filter(regex='Energ')
| Energía Caldera 1 (15 minuto) | Energía Caldera 2 (15 minuto) | Energía Primario ACS (15 minuto) | Energía Consumo ACS (15 minuto) | Energia Activa Total (15 minuto) | |
|---|---|---|---|---|---|
| datetime | |||||
| 2021-03-19 09:30:00 | 273.62 | 316.37 | 78.86 | 339.66 | 10355.56 |
| 2021-03-19 09:45:00 | 273.65 | 316.37 | 78.88 | 339.69 | 10356.33 |
| 2021-03-19 10:00:00 | 273.68 | 316.37 | 78.89 | 339.73 | 10357.14 |
| 2021-03-19 10:15:00 | 273.59 | 316.37 | 78.85 | 339.62 | 10354.83 |
| 2021-03-19 10:30:00 | 273.62 | 316.37 | 78.86 | 339.66 | 10355.56 |
| 2021-03-19 10:45:00 | 273.65 | 316.37 | 78.88 | 339.69 | 10356.33 |
| 2021-03-19 11:00:00 | 273.68 | 316.37 | 78.89 | 339.73 | 10357.14 |
| 2021-03-19 11:15:00 | 273.70 | 316.37 | 78.89 | 339.77 | 10357.71 |
| 2021-03-19 11:30:00 | 273.71 | 316.37 | 78.89 | 339.80 | 10358.28 |
These values are first replaced by NaN and then filled with evenly spaced intermediate values between the previous and next valid readings.
# Remove dirty values (the repeated stale readings between 10:15 and 11:00)
# for the affected meters.  Use a single .loc[rows, cols] assignment: the
# original chained form ts.loc[dt][columns] = ... assigns into a temporary
# copy and raises SettingWithCopyWarning without reliably updating ts.
columns = ['Energía Caldera 1 (15 minuto)', 'Energía Consumo ACS (15 minuto)', 'Energia Activa Total (15 minuto)']
ts.loc[datetime(2021, 3, 19, 10, 15, 0):datetime(2021, 3, 19, 11, 0, 0), columns] = np.nan
ts[datetime(2021, 3, 19, 9, 30, 0):datetime(2021, 3, 19, 11, 30, 0)].filter(like='Energ')
| Energía Caldera 1 (15 minuto) | Energía Caldera 2 (15 minuto) | Energía Primario ACS (15 minuto) | Energía Consumo ACS (15 minuto) | Energia Activa Total (15 minuto) | |
|---|---|---|---|---|---|
| datetime | |||||
| 2021-03-19 09:30:00 | 273.62 | 316.37 | 78.86 | 339.66 | 10355.56 |
| 2021-03-19 09:45:00 | 273.65 | 316.37 | 78.88 | 339.69 | 10356.33 |
| 2021-03-19 10:00:00 | 273.68 | 316.37 | 78.89 | 339.73 | 10357.14 |
| 2021-03-19 10:15:00 | NaN | 316.37 | 78.85 | NaN | NaN |
| 2021-03-19 10:30:00 | NaN | 316.37 | 78.86 | NaN | NaN |
| 2021-03-19 10:45:00 | NaN | 316.37 | 78.88 | NaN | NaN |
| 2021-03-19 11:00:00 | NaN | 316.37 | 78.89 | NaN | NaN |
| 2021-03-19 11:15:00 | 273.70 | 316.37 | 78.89 | 339.77 | 10357.71 |
| 2021-03-19 11:30:00 | 273.71 | 316.37 | 78.89 | 339.80 | 10358.28 |
# Replace the NaNs with evenly spaced intermediate values between the last
# good reading (10:00) and the next one (11:15).  .loc[row, col] writes into
# ts itself — the original chained ts.loc[dt][col] = v form assigned to a
# temporary copy (SettingWithCopyWarning) and repeated 12 near-identical lines.
fill_values = {
    'Energía Caldera 1 (15 minuto)': [273.684, 273.688, 273.693, 273.697],
    'Energía Consumo ACS (15 minuto)': [339.738, 339.746, 339.754, 339.762],
    'Energia Activa Total (15 minuto)': [10357.14, 10357.28, 10357.42, 10357.56],
}
fill_stamps = [datetime(2021, 3, 19, 10, 15, 0), datetime(2021, 3, 19, 10, 30, 0),
               datetime(2021, 3, 19, 10, 45, 0), datetime(2021, 3, 19, 11, 0, 0)]
for col, values in fill_values.items():
    for stamp, value in zip(fill_stamps, values):
        ts.loc[stamp, col] = value
ts[datetime(2021, 3, 19, 9, 30, 0):datetime(2021, 3, 19, 11, 30, 0)].filter(like='Energ')
| Energía Caldera 1 (15 minuto) | Energía Caldera 2 (15 minuto) | Energía Primario ACS (15 minuto) | Energía Consumo ACS (15 minuto) | Energia Activa Total (15 minuto) | |
|---|---|---|---|---|---|
| datetime | |||||
| 2021-03-19 09:30:00 | 273.620 | 316.37 | 78.86 | 339.660 | 10355.56 |
| 2021-03-19 09:45:00 | 273.650 | 316.37 | 78.88 | 339.690 | 10356.33 |
| 2021-03-19 10:00:00 | 273.680 | 316.37 | 78.89 | 339.730 | 10357.14 |
| 2021-03-19 10:15:00 | 273.684 | 316.37 | 78.85 | 339.738 | 10357.14 |
| 2021-03-19 10:30:00 | 273.688 | 316.37 | 78.86 | 339.746 | 10357.28 |
| 2021-03-19 10:45:00 | 273.693 | 316.37 | 78.88 | 339.754 | 10357.42 |
| 2021-03-19 11:00:00 | 273.697 | 316.37 | 78.89 | 339.762 | 10357.56 |
| 2021-03-19 11:15:00 | 273.700 | 316.37 | 78.89 | 339.770 | 10357.71 |
| 2021-03-19 11:30:00 | 273.710 | 316.37 | 78.89 | 339.800 | 10358.28 |
In this section some new time series are created by differentiating the energy time series, producing new mean power time series that are available for all 6 months and can be useful for forecasts. These series lack precision: their smallest decimal refers to 1/100 MWh, giving a resolution of 10 kW, in contrast to the dedicated power meters that offer a resolution of 0.01 kW. However, for total loads (where precision matters less) they are expected to provide useful forecasts.
ts["Diff Load Activa Total (15 minuto)"] = ts["Energia Activa Total (15 minuto)"].diff()
ts["Diff Load Primario ACS (15 minuto)"] = ts["Energía Primario ACS (15 minuto)"].diff()
ts["Diff Load Consumo ACS (15 minuto)"] = ts["Energía Consumo ACS (15 minuto)"].diff()
ts["Diff Load Caldera 1 (15 minuto)"] = ts["Energía Caldera 1 (15 minuto)"].diff()
ts["Diff Load Caldera 2 (15 minuto)"] = ts["Energía Caldera 2 (15 minuto)"].diff()
To later compare differentiated energies (average 15min loads) with respective measured powers we need to convert power to 15min loads. The conversion is kw to Mwh for a timestep of 15 min so we need to divide powers by 4 * 1000. This conversion creates the final version of the 15 min timestep dataset
def power_to_mean_load_kwh(x):
    """Convert an instantaneous power (kW) to the mean load (kWh) over a 15-min step."""
    return x * 0.25


def mwh_to_kwh(x):
    """Convert an energy value from MWh to kWh."""
    return x * 1000
ts_new = ts[ts.filter(regex="Potencia").columns].apply(lambda x: power_to_mean_load_kwh(x))
ts_new.columns = [i.replace("Potencia", "True Load") for i in ts_new.columns]
ts15 = pd.concat([ts, ts_new], axis=1)
ts15[ts.filter(regex="Diff Load").columns] = ts15[ts15.filter(regex="Diff Load").columns].apply(lambda x: mwh_to_kwh(x))
# drop potencia true loads have been created instead
ts15.drop(columns=ts15.filter(like='Potencia'), inplace=True)
# replace unknown variables with NaN instead of zeros that have occured due to operations
# new variables that start later than 14/5
ts15.loc[:datetime(2021, 5, 14, 23, 45, 0)][new_variables] = np.nan
# this is the date where energy measurements 1st appear
ts15.loc[:datetime(2020, 11, 5, 23, 45, 0)][ts15.filter(like='Diff Load').columns.tolist()] = np.nan
ts15.to_csv("../VEOLIA/artifacts/timeseries_15min.csv")
#update units dict
new_cols = ts15.filter(like='Diff').columns.tolist() + ts15.filter(like='True').columns.tolist()
new_var_to_unit = {c:'kWh' for c in new_cols}
variable_to_unit.update(new_var_to_unit)
print('15min dataset:')
ts15.head(3)
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._set_item(key, value)
15min dataset:
| CONTADOR GAS CALDERA 1 (15 minuto) | CONTADOR GAS CALDERA 2 (15 minuto) | Energía Caldera 1 (15 minuto) | Energía Caldera 2 (15 minuto) | Energía Primario ACS (15 minuto) | Energía Consumo ACS (15 minuto) | Energia Activa Total (15 minuto) | TEMPERATURA EXTERIOR (15 minuto) | Caudal Caldera 1 (15 minuto) | TEMPERATURA COLECTOR IMPULSION (15 minuto) | ... | DEMANDA QUEMADORES (15 minuto) | Diff Load Activa Total (15 minuto) | Diff Load Primario ACS (15 minuto) | Diff Load Consumo ACS (15 minuto) | Diff Load Caldera 1 (15 minuto) | Diff Load Caldera 2 (15 minuto) | True Load Caldera 1 (15 minuto) | True Load Caldera 2 (15 minuto) | True Load Primario ACS (15 minuto) | True Load Consumo ACS (15 minuto) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||||||||
| 2020-11-01 00:00:00 | 1623.9 | 3290.1 | 33.91 | 43.26 | 1.54 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-11-01 00:15:00 | 1623.9 | 3290.1 | 33.91 | 43.26 | 1.54 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2020-11-01 00:30:00 | 1637.0 | 3296.1 | 33.92 | 43.27 | 1.55 | 0.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 rows × 41 columns
We create a new load only dataset and change its timestep to 30min and 60min in order to render diff loads more similar to true loads. For cumulative measurements (loads, energies) aggregation should be sum. For the rest of measurements (flows, temperature, powers) we need to consider a mean aggregator.
# Resample to 30 min: cumulative series (loads) are summed, counters and
# energies take the interval max, instantaneous series (temperatures,
# demands, flows) are averaged.
load30 = ts15.filter(like='Load').resample('30T').sum()
energy30 = ts15.filter(like='Energ').resample('30T').max()
volume30 = ts15.filter(like='Volumen').resample('30T').max()
power30 = ts15.filter(like='Potencia').resample('30T').mean()  # empty selection: Potencia was dropped from ts15
temp30 = ts15.filter(regex='Temp|TEMP').resample('30T').mean()
demanda30 = ts15.filter(like='DEMANDA').resample('30T').mean()
flow30 = ts15.filter(like='Caudal').resample('30T').mean()
contador30 = ts15.filter(like='CONTADOR').resample('30T').max()
ts30 = pd.concat([load30, energy30, volume30, power30, temp30, demanda30, flow30, contador30], axis=1)
ts30.columns = [col.replace('15', '30') for col in ts30.columns]
# drop potencia as true loads have been created instead
ts30.drop(columns=ts30.filter(like='Potencia'), inplace=True)
# Replace unknown variables with NaN instead of zeros that have occurred due
# to differentiation (new variables only start on 14/5/2021).  BUG FIX:
# ts30's columns were renamed to '(30 minuto)' above, so the 15-min names in
# new_variables must be translated first; the original chained
# ts30.loc[:date][new_variables] = ... silently created the missing
# 15-min-named columns on a temporary copy and left ts30 untouched.
late30 = [c.replace('15', '30') for c in new_variables]
late30 = [c for c in late30 if c in ts30.columns]
ts30.loc[:datetime(2021, 5, 14, 23, 30, 0), late30] = np.nan
# this is the date where energy measurements 1st appear
ts30.loc[:datetime(2020, 11, 5, 23, 30, 0), ts30.filter(like='Diff Load').columns.tolist()] = np.nan
# save csv
ts30.to_csv("../VEOLIA/artifacts/timeseries_30min.csv")
print('30min dataset:')
ts30.head()
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._set_item(key, value)
30min dataset:
| Diff Load Activa Total (30 minuto) | Diff Load Primario ACS (30 minuto) | Diff Load Consumo ACS (30 minuto) | Diff Load Caldera 1 (30 minuto) | Diff Load Caldera 2 (30 minuto) | True Load Caldera 1 (30 minuto) | True Load Caldera 2 (30 minuto) | True Load Primario ACS (30 minuto) | True Load Consumo ACS (30 minuto) | Energía Caldera 1 (30 minuto) | ... | TEMPERATURA DEPOSITO 2 ACS (30 minuto) | DEMANDA CALEFACCIÓN (30 minuto) | DEMANDA ACS (30 minuto) | DEMANDA QUEMADORES (30 minuto) | Caudal Caldera 1 (30 minuto) | Caudal Caldera 2 (30 minuto) | Caudal Primario ACS (30 minuto) | Caudal Consumo ACS (30 minuto) | CONTADOR GAS CALDERA 1 (30 minuto) | CONTADOR GAS CALDERA 2 (30 minuto) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||||||||
| 2020-11-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.91 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1623.9 | 3290.1 |
| 2020-11-01 00:30:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.93 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1649.4 | 3296.1 |
| 2020-11-01 01:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.95 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1673.1 | 3296.1 |
| 2020-11-01 01:30:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.97 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1696.7 | 3296.1 |
| 2020-11-01 02:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.99 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1720.3 | 3296.1 |
5 rows × 41 columns
# Resample to 60 min with the same aggregation rules as the 30-min dataset:
# sum for loads, max for counters/energies, mean for instantaneous series.
load60 = ts15.filter(like='Load').resample('60T').sum()
energy60 = ts15.filter(like='Energ').resample('60T').max()
volume60 = ts15.filter(like='Volumen').resample('60T').max()
power60 = ts15.filter(like='Potencia').resample('60T').mean()  # empty selection: Potencia was dropped from ts15
temp60 = ts15.filter(regex='Temp|TEMP').resample('60T').mean()
demanda60 = ts15.filter(like='DEMANDA').resample('60T').mean()
flow60 = ts15.filter(like='Caudal').resample('60T').mean()
contador60 = ts15.filter(like='CONTADOR').resample('60T').max()
ts60 = pd.concat([load60, energy60, volume60, power60, temp60, demanda60, flow60, contador60], axis=1)
ts60.columns = [col.replace('15', '60') for col in ts60.columns]
# drop potencia as true loads have been created instead
ts60.drop(columns=ts60.filter(like='Potencia'), inplace=True)
# Replace unknown variables with NaN instead of zeros that have occurred due
# to differentiation (new variables only start on 14/5/2021).  BUG FIX:
# ts60's columns were renamed to '(60 minuto)' above, so the 15-min names in
# new_variables must be translated first; the original chained
# ts60.loc[:date][new_variables] = ... silently created the missing
# 15-min-named columns on a temporary copy and left ts60 untouched.
late60 = [c.replace('15', '60') for c in new_variables]
late60 = [c for c in late60 if c in ts60.columns]
ts60.loc[:datetime(2021, 5, 14, 23, 0, 0), late60] = np.nan
# this is a day after the date where energy measurements 1st appear
ts60.loc[:datetime(2020, 11, 5, 23, 0, 0), ts60.filter(like='Diff Load').columns.tolist()] = np.nan
# save csv
ts60.to_csv("../VEOLIA/artifacts/timeseries_60min.csv")
print('60min dataset:')
ts60.head()
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._set_item(key, value)
60min dataset:
| Diff Load Activa Total (60 minuto) | Diff Load Primario ACS (60 minuto) | Diff Load Consumo ACS (60 minuto) | Diff Load Caldera 1 (60 minuto) | Diff Load Caldera 2 (60 minuto) | True Load Caldera 1 (60 minuto) | True Load Caldera 2 (60 minuto) | True Load Primario ACS (60 minuto) | True Load Consumo ACS (60 minuto) | Energía Caldera 1 (60 minuto) | ... | TEMPERATURA DEPOSITO 2 ACS (60 minuto) | DEMANDA CALEFACCIÓN (60 minuto) | DEMANDA ACS (60 minuto) | DEMANDA QUEMADORES (60 minuto) | Caudal Caldera 1 (60 minuto) | Caudal Caldera 2 (60 minuto) | Caudal Primario ACS (60 minuto) | Caudal Consumo ACS (60 minuto) | CONTADOR GAS CALDERA 1 (60 minuto) | CONTADOR GAS CALDERA 2 (60 minuto) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||||||||
| 2020-11-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.93 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1649.4 | 3296.1 |
| 2020-11-01 01:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 33.97 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1696.7 | 3296.1 |
| 2020-11-01 02:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 34.00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1720.3 | 3296.1 |
| 2020-11-01 03:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 34.00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1720.3 | 3296.1 |
| 2020-11-01 04:00:00 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 34.00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1722.7 | 3296.1 |
5 rows × 41 columns
In this section a calendar is created containing auxiliary info for all unique dates that appear in the core dataset. Holidays are extracted for the province of Valladolid using the holidays-es Python package.
def isholiday(x, holiday_list):
    """Return True when date x appears in holiday_list, else False."""
    return x in holiday_list
def isweekend(x):
    """Return True when weekday index x falls on a weekend.

    x follows datetime.weekday() numbering (Mon=0 ... Sun=6), as produced by
    the calendar builder below, so the weekend days are 5 (Saturday) and
    6 (Sunday).  BUG FIX: the original tested ``x == 6 or x == 0``, which
    flagged Sunday and *Monday* as the weekend instead of Saturday/Sunday.
    """
    return x >= 5
def create_calendar(timestep_minutes):
    """Create a calendar of auxiliary time features for every timestamp.

    Parameters
    ----------
    timestep_minutes : int or str
        Resolution of the source timeseries: 15, 30 or 60.  Selects the
        corresponding module-level dataframe (ts15 / ts30 / ts60) whose
        datetime index supplies the timestamps.

    Returns
    -------
    pd.DataFrame
        One row per timestamp with columns: datetime, year, month, day,
        hour, minute, second, weekday (Monday=0), weekend flag and
        holiday flag (Valladolid national + regional + local holidays).
    """
    sources = {'15': ts15, '30': ts30, '60': ts60}
    calendar = pd.DataFrame(sources[str(timestep_minutes)].index.tolist(), columns=['datetime'])
    # Vectorized .dt accessors replace the per-row .apply calls of the
    # original implementation — same values, much faster.
    dt = calendar['datetime'].dt
    calendar['year'] = dt.year
    calendar['month'] = dt.month
    calendar['day'] = dt.day
    calendar['hour'] = dt.hour
    calendar['minute'] = dt.minute
    calendar['second'] = dt.second
    calendar['weekday'] = dt.weekday
    calendar['weekend'] = calendar['weekday'].apply(isweekend)
    # Build the Province object once (instead of three times) and merge
    # all holiday levels into a set for O(1) membership tests.
    province = Province(name="valladolid")
    holidays = set(province.national_holidays()
                   + province.regional_holidays()
                   + province.local_holidays())
    calendar['holiday'] = dt.date.apply(lambda d: isholiday(d, holidays))
    return calendar
# Build the quarter-hourly (15-minute) calendar and preview its structure.
calendar = create_calendar(15)
print("15 min Calendar dataset structure:")
calendar.head()
15 min Calendar dataset structure:
| datetime | year | month | day | hour | minute | second | weekday | weekend | holiday | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-11-01 00:00:00 | 2020 | 11 | 1 | 0 | 0 | 0 | 6 | True | False |
| 1 | 2020-11-01 00:15:00 | 2020 | 11 | 1 | 0 | 15 | 0 | 6 | True | False |
| 2 | 2020-11-01 00:30:00 | 2020 | 11 | 1 | 0 | 30 | 0 | 6 | True | False |
| 3 | 2020-11-01 00:45:00 | 2020 | 11 | 1 | 0 | 45 | 0 | 6 | True | False |
| 4 | 2020-11-01 01:00:00 | 2020 | 11 | 1 | 1 | 0 | 0 | 6 | True | False |
As only 7 holidays fall within our study period, we observe 672 rows with the holiday variable set to True: each day contains 24x4 = 96 quarter-hour timesteps, and 96x7 = 672.
# Sanity check: 7 holiday dates x 96 quarter-hour steps/day = 672 rows.
print('Total quarterly records with True holiday label')
calendar['holiday'].sum()
Total quarterly records with True holiday label
672
At this point mixed datasets (timeseries + calendar features) are also created.
# Left-join each resampled timeseries with its matching calendar on the
# datetime index, then persist the enriched datasets as CSV artifacts.
ts15_plus_time = pd.merge(ts15, create_calendar(15), how='left',left_index=True,right_on='datetime').set_index('datetime')
ts15_plus_time.to_csv("../VEOLIA/artifacts/timeseries_plus_time_15min.csv")
ts30_plus_time = pd.merge(ts30, create_calendar(30), how='left',left_index=True,right_on='datetime').set_index('datetime')
ts30_plus_time.to_csv("../VEOLIA/artifacts/timeseries_plus_time_30min.csv")
ts60_plus_time = pd.merge(ts60, create_calendar(60), how='left',left_index=True,right_on='datetime').set_index('datetime')
ts60_plus_time.to_csv("../VEOLIA/artifacts/timeseries_plus_time_60min.csv")
calendar.head(3)
| datetime | year | month | day | hour | minute | second | weekday | weekend | holiday | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-11-01 00:00:00 | 2020 | 11 | 1 | 0 | 0 | 0 | 6 | True | False |
| 1 | 2020-11-01 00:15:00 | 2020 | 11 | 1 | 0 | 15 | 0 | 6 | True | False |
| 2 | 2020-11-01 00:30:00 | 2020 | 11 | 1 | 0 | 30 | 0 | 6 | True | False |
We validate again that no timesteps are missing from the dataset by ensuring that no time difference amongst successive timesteps is larger than 15 minutes
# Gaps in the index would show up as successive-timestamp differences
# larger than the base 15-minute step; an empty result means no gaps.
diffs = calendar['datetime'].diff()
# 'M' is an ambiguous timedelta unit (months vs minutes) and deprecated
# in pandas — use the explicit 'min' unit instead.
diffs[diffs > pd.Timedelta('15min')]
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3441: FutureWarning: Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and will be removed in a future version exec(code_obj, self.user_global_ns, self.user_ns)
Series([], Name: datetime, dtype: timedelta64[ns])
# Distribution of calendar features.  countplot's data argument must be
# passed as a keyword (x=) — positional use is deprecated and removed in
# seaborn 0.12 (see the FutureWarnings emitted by the original cell).
for col in ['weekday', 'month', 'year', 'holiday']:
    sns.countplot(x=calendar[col])
    plt.title(col)
    plt.xticks(rotation=45)
    plt.show()
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
/home/sotiris/opt/inergy/venv/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Per-variable probability density functions (histogram + KDE) on a
# 7x6 grid of subplots.
print("PDFs")
fig, axs = plt.subplots(nrows=7, ncols=6, figsize=(30, 40))
# zip pairs each column with one subplot cell in row-major order; this
# replaces the original's manual i/j bookkeeping and early-break guard.
for col, ax in zip(ts15.columns, axs.flat):
    sns.histplot(x=ts15[col], kde=True, ax=ax)
    ax.set(xlabel=variable_to_unit[col], title=col)
    # Style each subplot directly: the original plt.xticks()/plt.grid()
    # only affected the "current" axes, not every grid cell.
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)
# Single stable filename — the original f-string leaked the loop
# variable `col` into the saved file's name.
plt.savefig("./figures/PDFS")
plt.show()
PDFs
# Per-variable cumulative distribution functions on a 7x6 grid.
print("CDFs")
fig, axs = plt.subplots(nrows=7, ncols=6, figsize=(30, 40))
# zip pairs each column with one subplot cell in row-major order; this
# replaces the original's manual i/j bookkeeping and early-break guard.
for col, ax in zip(ts15.columns, axs.flat):
    sns.histplot(x=ts15[col], kde=True, cumulative=True, ax=ax)
    ax.set(xlabel=variable_to_unit[col], title=col)
    # Style each subplot directly: the original plt.xticks()/plt.grid()
    # only affected the "current" axes, not every grid cell.
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True)
plt.savefig(f"./figures/CDFs")
plt.show()
CDFs
Normally here we would observe purely increasing trends but probably the counters were reset in late November 2020. Unit is cubic meters (m3)
# gas meters
# Cumulative gas-meter readings (m3); downward jumps indicate counter resets.
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like="CONTADOR").columns, title='Gas meters (m3)')
fig.show()
# Cumulative energy variables (MWh) over the same period.
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like="Energ").columns, title='Energy variables (MWh)')
fig.show()
Machine temperatures are captured from mid May and after... Not many things to observe for now except for the pure daily patterns that are visible in the graph due to the small sample size.
# Machine temperature sensors (data only captured from mid-May onwards).
fig = px.line(ts15, x=ts15.index, y=ts15.filter(regex="Temper|TEMPER").columns, title='Machine Temperatures (°C)')
fig.show()
True loads are extracted from average powers (multiplication with time [kw to kwh])
# "True Load" energies derived from average power (kW x time -> kWh).
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like="True Load").columns, title='Power (kWh)')
fig.show()
# Water flow rates.
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like="Caudal").columns, title='Flow (m3/h)')
fig.show()
# Cumulative volumes.
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like="Volume").columns, title='Volumes (m3)')
fig.show()
# DEMANDA variables (°C) — presumably demand temperature setpoints;
# confirm against the pilot's variable documentation.
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like="DEMANDA").columns, title='Demandas (°C)')
fig.show()
We can observe that the total active load is much larger than the other variables, suggesting that it is the sum of the other load variables.
# Loads obtained by differencing the cumulative energy counters.
fig = px.line(ts15, x=ts15.index, y=ts15.filter(like='Diff Load').columns, title='Diff Loads (kWh)')
fig.show()
# Overlay differenced ("Diff Load") and measured ("True Load") series.
columns=ts15.filter(like="Diff Load").columns.tolist() + ts15.filter(like="True Load").columns.tolist()
fig = px.line(ts15, x=ts15.index, y=columns, title='Average Loads (kWh)')
fig.show()
There is a pretty decent overlapping amongst corresponding true / diff time series.
# Compare the last 20 records of corresponding True/Diff pairs; only the
# ACS comparison is rendered (last expression in the notebook cell).
ts15[["True Load Caldera 1 (15 minuto)", "Diff Load Caldera 1 (15 minuto)"]].tail(20)
ts15[["True Load Consumo ACS (15 minuto)", "Diff Load Consumo ACS (15 minuto)"]].tail(20)
| True Load Consumo ACS (15 minuto) | Diff Load Consumo ACS (15 minuto) | |
|---|---|---|
| datetime | ||
| 2021-06-30 19:00:00 | 24.0075 | 20.0 |
| 2021-06-30 19:15:00 | 24.2875 | 30.0 |
| 2021-06-30 19:30:00 | 23.1625 | 20.0 |
| 2021-06-30 19:45:00 | 23.1625 | 30.0 |
| 2021-06-30 20:00:00 | 23.1625 | 20.0 |
| 2021-06-30 20:15:00 | 24.0075 | 20.0 |
| 2021-06-30 20:30:00 | 24.2875 | 20.0 |
| 2021-06-30 20:45:00 | 24.0075 | 30.0 |
| 2021-06-30 21:00:00 | 22.5975 | 20.0 |
| 2021-06-30 21:15:00 | 22.5975 | 20.0 |
| 2021-06-30 21:30:00 | 23.7250 | 30.0 |
| 2021-06-30 21:45:00 | 23.4425 | 20.0 |
| 2021-06-30 22:00:00 | 24.2875 | 30.0 |
| 2021-06-30 22:15:00 | 23.1625 | 20.0 |
| 2021-06-30 22:30:00 | 24.2875 | 20.0 |
| 2021-06-30 22:45:00 | 25.6975 | 30.0 |
| 2021-06-30 23:00:00 | 27.3875 | 30.0 |
| 2021-06-30 23:15:00 | 27.6700 | 20.0 |
| 2021-06-30 23:30:00 | 25.6975 | 30.0 |
| 2021-06-30 23:45:00 | 27.1075 | 30.0 |
# Same Diff-vs-True comparison on the 30-minute dataset.
columns=ts30.filter(like="Diff Load").columns.tolist() + ts30.filter(like="True Load").columns.tolist()
fig = px.line(ts30, x=ts30.index, y=columns, title='Average Loads 30min (Differentiation vs True) [kWh]')
fig.show()
ts30[["True Load Caldera 1 (30 minuto)", "Diff Load Caldera 1 (30 minuto)"]].tail(20)
| True Load Caldera 1 (30 minuto) | Diff Load Caldera 1 (30 minuto) | |
|---|---|---|
| datetime | ||
| 2021-06-30 14:00:00 | 9.9000 | 10.0 |
| 2021-06-30 14:30:00 | 10.1475 | 20.0 |
| 2021-06-30 15:00:00 | 10.8900 | 0.0 |
| 2021-06-30 15:30:00 | 19.5525 | 20.0 |
| 2021-06-30 16:00:00 | 8.9100 | 20.0 |
| 2021-06-30 16:30:00 | 14.1050 | 0.0 |
| 2021-06-30 17:00:00 | 19.8000 | 20.0 |
| 2021-06-30 17:30:00 | 0.4950 | 10.0 |
| 2021-06-30 18:00:00 | 9.6525 | 10.0 |
| 2021-06-30 18:30:00 | 10.3950 | 10.0 |
| 2021-06-30 19:00:00 | 0.0000 | 0.0 |
| 2021-06-30 19:30:00 | 9.4050 | 10.0 |
| 2021-06-30 20:00:00 | 9.4050 | 10.0 |
| 2021-06-30 20:30:00 | 0.0000 | 0.0 |
| 2021-06-30 21:00:00 | 19.5525 | 10.0 |
| 2021-06-30 21:30:00 | 0.0000 | 10.0 |
| 2021-06-30 22:00:00 | 19.3025 | 20.0 |
| 2021-06-30 22:30:00 | 15.5900 | 20.0 |
| 2021-06-30 23:00:00 | 14.1050 | 10.0 |
| 2021-06-30 23:30:00 | 13.1150 | 10.0 |
# Same Diff-vs-True comparison on the 60-minute dataset.
columns=ts60.filter(like="Diff Load").columns.tolist() + ts60.filter(like="True Load").columns.tolist()
fig = px.line(ts60, x=ts60.index, y=columns, title='Average Loads 60min (Differentiation vs True) [kWh]')
fig.show()
ts60[["True Load Caldera 1 (60 minuto)", "Diff Load Caldera 1 (60 minuto)"]].tail(20)
| True Load Caldera 1 (60 minuto) | Diff Load Caldera 1 (60 minuto) | |
|---|---|---|
| datetime | ||
| 2021-06-30 04:00:00 | 10.1475 | 20.0 |
| 2021-06-30 05:00:00 | 10.3950 | 10.0 |
| 2021-06-30 06:00:00 | 16.0875 | 20.0 |
| 2021-06-30 07:00:00 | 16.0875 | 20.0 |
| 2021-06-30 08:00:00 | 25.4900 | 30.0 |
| 2021-06-30 09:00:00 | 23.0150 | 10.0 |
| 2021-06-30 10:00:00 | 9.6525 | 10.0 |
| 2021-06-30 11:00:00 | 50.9825 | 50.0 |
| 2021-06-30 12:00:00 | 23.2625 | 10.0 |
| 2021-06-30 13:00:00 | 16.8300 | 20.0 |
| 2021-06-30 14:00:00 | 20.0475 | 30.0 |
| 2021-06-30 15:00:00 | 30.4425 | 20.0 |
| 2021-06-30 16:00:00 | 23.0150 | 20.0 |
| 2021-06-30 17:00:00 | 20.2950 | 30.0 |
| 2021-06-30 18:00:00 | 20.0475 | 20.0 |
| 2021-06-30 19:00:00 | 9.4050 | 10.0 |
| 2021-06-30 20:00:00 | 9.4050 | 10.0 |
| 2021-06-30 21:00:00 | 19.5525 | 20.0 |
| 2021-06-30 22:00:00 | 34.8925 | 40.0 |
| 2021-06-30 23:00:00 | 27.2200 | 20.0 |
We can observe that discrepancies between true and diff loads are somewhat mitigated at larger timesteps. We will therefore process the 1-hour timestep dataset in the next steps.
# Mean total active load grouped by several calendar features, drawn on
# a single row of five subplots (hour/weekday/month as lines,
# holiday/weekend as bars).
fig, axes=plt.subplots(1,5)
print('Diff Load Activa Total (60 minuto) [kWh]:')
ts60_plus_time.groupby('hour')['Diff Load Activa Total (60 minuto)'].mean().plot(ax=axes[0], figsize=(25,4))
ts60_plus_time.groupby('weekday')['Diff Load Activa Total (60 minuto)'].mean().plot(ax=axes[1], figsize=(25,3))
ts60_plus_time.groupby('month')['Diff Load Activa Total (60 minuto)'].mean().plot(ax=axes[2], figsize=(25,3))
ts60_plus_time.groupby('holiday')['Diff Load Activa Total (60 minuto)'].mean().plot.bar(ax=axes[3], figsize=(25,3))
ts60_plus_time.groupby('weekend')['Diff Load Activa Total (60 minuto)'].mean().plot.bar(ax=axes[4], figsize=(25,3))
Diff Load Activa Total (60 minuto) [kWh]:
<AxesSubplot:xlabel='weekend'>
Observations:
Correlograms amongst all variables
# Correlation heatmap across all 60-minute variables (corrplot comes
# from the local `heatmap` module imported at the top of the notebook).
plt.figure(figsize=(20, 16))
corrplot(ts60.corr(), size_scale=300)
plt.savefig('./figures/corrplot_60min')
Observations
Scatterplots along with their regression lines are also illustrated for the most interesting variable pairs as described above.
# Scatterplots with fitted regression lines, driven by explicit lists of
# variable pairs instead of repeated one-off calls.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(24, 18))
fig.suptitle('Total active load vs other variables')
# Predictors plotted against total active load, in row-major grid order.
predictors = [
    "TEMPERATURA EXTERIOR (60 minuto)",
    "DEMANDA CALEFACCIÓN (60 minuto)",
    "DEMANDA QUEMADORES (60 minuto)",
    "Diff Load Caldera 1 (60 minuto)",
    "Diff Load Consumo ACS (60 minuto)",
    "Energia Activa Total (60 minuto)",
]
for ax, xcol in zip(axs.flat, predictors):
    sns.regplot(y="Diff Load Activa Total (60 minuto)", x=xcol, data=ts60, marker='.', ax=ax)
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16,6))
fig.suptitle('ACS vs external temperature')
# Both ACS loads against the outdoor temperature.
acs_loads = ["True Load Consumo ACS (60 minuto)", "True Load Primario ACS (60 minuto)"]
for ax, ycol in zip(axs, acs_loads):
    sns.regplot(y=ycol, x="TEMPERATURA EXTERIOR (60 minuto)", data=ts60, marker='.', ax=ax)
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(24,6))
fig.suptitle('Diff loads vs True loads')
# Corresponding (differenced, measured) load pairs.
diff_true_pairs = [
    ("Diff Load Consumo ACS (60 minuto)", "True Load Consumo ACS (60 minuto)"),
    ("Diff Load Caldera 2 (60 minuto)", "True Load Caldera 2 (60 minuto)"),
    ("Diff Load Caldera 1 (60 minuto)", "True Load Caldera 1 (60 minuto)"),
]
for ax, (ycol, xcol) in zip(axs, diff_true_pairs):
    sns.regplot(y=ycol, x=xcol, data=ts60, marker='.', ax=ax)
Extra observations:
In the created datasets:
import statsmodels.api as sm
# Autocorrelation / partial autocorrelation of the total active load;
# the earliest days (before 2020-11-06) are excluded from the slice.
total_load = ts60[datetime(2020, 11, 6, 0, 0, 0):]['Diff Load Activa Total (60 minuto)']
fig, ax = plt.subplots(2, 1, figsize=(15, 8))
fig = sm.graphics.tsa.plot_acf(total_load, lags=50, ax=ax[0])
fig = sm.graphics.tsa.plot_pacf(total_load, lags=50, ax=ax[1])
plt.show()
from statsmodels.tsa.seasonal import seasonal_decompose
# Additive decomposition with a daily seasonal period (24 hourly steps);
# the earliest days (before 2020-11-06) are excluded.
result = seasonal_decompose(ts60[datetime(2020, 11, 6, 0, 0, 0):]['Diff Load Activa Total (60 minuto)'], model='additive', period=24)
f = result.plot()
f.set_figheight(15)
f.set_figwidth(25)
plt.show()
# Persist each decomposition component as a separate CSV artifact.
result.seasonal.to_csv('../VEOLIA/artifacts/decomposition/TAL_seasonal.csv')
result.trend.to_csv('../VEOLIA/artifacts/decomposition/TAL_trend.csv')
result.resid.to_csv('../VEOLIA/artifacts/decomposition/TAL_resid.csv')
# BUG FIX: the observed series was previously written to TAL_resid.csv,
# silently overwriting the residual artifact saved just above.
result.observed.to_csv('../VEOLIA/artifacts/decomposition/TAL_observed.csv')